Setup the Environment

CSS Styling
writeLines("td, th { padding : 3px } th { background-color:white; color:black; border:1px solid black; text-align:center } td {color:black; border:1px solid black; word-wrap:break-word; white-space:nowrap; overflow: hidden; text-overflow: ellipsis; max-width:300px; text-align:left}", con= "../css/style.css")

Comparison of Samples and Metabolites Between Replicate Pairs

Boxplots (Original Abundances)

# Plot Boxplots faceted by shared metabolites
################################################################################
p <- plot_df %>%
  ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
  geom_boxplot(alpha = 0.8) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
             x = "Abundance", y = "") 
plot(p)

Zero/Missing Values in Metabolites (Original Abundances)

# Identify Zero/Missing Values in Metabolites
################################################################################
na_df <- plot_df %>%
  ungroup() %>% 
  group_by(METABOLITE_NAME, REPLICATE) %>%
  mutate(ZERO_LOG = ifelse(VALUE == 0, 1, 0)) %>%
  mutate(ZERO_N = sum(ZERO_LOG)) %>%
  mutate(ZERO_FREQ = round(ZERO_N/length(shared_sams), digits = 2)) %>%
  mutate(NA_LOG = ifelse(is.na(VALUE), 1, 0)) %>%
  mutate(NA_N = sum(NA_LOG)) %>%
  mutate(NA_FREQ = round(NA_N/length(shared_sams), digits = 2)) %>%
  select(REPLICATE, METABOLITE_NAME, ZERO_N, ZERO_FREQ, NA_N, NA_FREQ) %>%
  unique() %>%
  arrange(REPLICATE, desc(ZERO_FREQ), desc(NA_FREQ)) %>%
  ungroup()

na_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "400px")
REPLICATE METABOLITE_NAME ZERO_N ZERO_FREQ NA_N NA_FREQ
610 Aminoisobutyric acid 78 1.00 0 0
610 5-Hydroxylysine 78 1.00 0 0
610 Homocysteine 72 0.92 0 0
610 3-Methylhistidine 66 0.85 0 0
610 Cystathionine 59 0.76 0 0
610 Carnosine 54 0.69 0 0
610 Anserine 37 0.47 0 0
610 1-Methylhistidine 16 0.21 0 0
610 beta-Alanine 5 0.06 0 0
610 Alanine 0 0.00 0 0
610 Aminoadipic acid 0 0.00 0 0
610 2-Aminobutyric acid 0 0.00 0 0
610 Arginine 0 0.00 0 0
610 Asparagine 0 0.00 0 0
610 Aspartic acid 0 0.00 0 0
610 Citrulline 0 0.00 0 0
610 Cysteine 0 0.00 0 0
610 Ethanolamine 0 0.00 0 0
610 gamma-Aminobutyric acid 0 0.00 0 0
610 Glutamic acid 0 0.00 0 0
610 Glutamine 0 0.00 0 0
610 Glycine 0 0.00 0 0
610 Histidine 0 0.00 0 0
610 Hydroxyproline 0 0.00 0 0
610 Isoleucine 0 0.00 0 0
610 Leucine 0 0.00 0 0
610 Lysine 0 0.00 0 0
610 Methionine 0 0.00 0 0
610 Ornithine 0 0.00 0 0
610 Phenylalanine 0 0.00 0 0
610 Phosphoethanolamine 0 0.00 0 0
610 Proline 0 0.00 0 0
610 Sarcosine 0 0.00 0 0
610 Serine 0 0.00 0 0
610 Taurine 0 0.00 0 0
610 Threonine 0 0.00 0 0
610 Tryptophan 0 0.00 0 0
610 Tyrosine 0 0.00 0 0
610 Valine 0 0.00 0 0
611 Aminoisobutyric acid 78 1.00 0 0
611 5-Hydroxylysine 78 1.00 0 0
611 3-Methylhistidine 67 0.86 0 0
611 Homocysteine 63 0.81 0 0
611 Carnosine 53 0.68 0 0
611 Cystathionine 52 0.67 0 0
611 Anserine 30 0.38 0 0
611 1-Methylhistidine 16 0.21 0 0
611 beta-Alanine 1 0.01 0 0
611 Alanine 0 0.00 0 0
611 Aminoadipic acid 0 0.00 0 0
611 2-Aminobutyric acid 0 0.00 0 0
611 Arginine 0 0.00 0 0
611 Asparagine 0 0.00 0 0
611 Aspartic acid 0 0.00 0 0
611 Citrulline 0 0.00 0 0
611 Cysteine 0 0.00 0 0
611 Ethanolamine 0 0.00 0 0
611 gamma-Aminobutyric acid 0 0.00 0 0
611 Glutamic acid 0 0.00 0 0
611 Glutamine 0 0.00 0 0
611 Glycine 0 0.00 0 0
611 Histidine 0 0.00 0 0
611 Hydroxyproline 0 0.00 0 0
611 Isoleucine 0 0.00 0 0
611 Leucine 0 0.00 0 0
611 Lysine 0 0.00 0 0
611 Methionine 0 0.00 0 0
611 Ornithine 0 0.00 0 0
611 Phenylalanine 0 0.00 0 0
611 Phosphoethanolamine 0 0.00 0 0
611 Proline 0 0.00 0 0
611 Sarcosine 0 0.00 0 0
611 Serine 0 0.00 0 0
611 Taurine 0 0.00 0 0
611 Threonine 0 0.00 0 0
611 Tryptophan 0 0.00 0 0
611 Tyrosine 0 0.00 0 0
611 Valine 0 0.00 0 0
# Remove Metabolites with high NAor Zero Frequencies
################################################################################
met_rm <- na_df %>%
  filter((ZERO_FREQ >= 0.95) | (NA_FREQ >= 0.8)) %>%
  select(METABOLITE_NAME) %>% unlist() %>% unique()
metabolites <- metabolites[metabolites %!in% met_rm]
plot_df <- plot_df %>%
  filter(METABOLITE_NAME %in% metabolites)
plot_df$METABOLITE_NAME <- as.character(plot_df$METABOLITE_NAME)

Distributions (Original Abundances)

# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in c(reps)){
  # Collect numeric vector
  num_vec <- plot_df %>%
    filter(REPLICATE == rep) %>%
    select(VALUE) %>% unlist() %>% unname()
  df <- NumericSummaryStats(num_vec)
  row.names(df) <- rep
  df_all <- rbind(df_all, df)
}
df_all %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
NA_COUNT NA_FREQ MEAN MEDIAN MAX MIN MID_RANGE VARIANCE STD_DEV SE_MEAN Q1 Q3 KURTOSIS SKEW BC_LAMBDA
611 0 0 3.31 0.75 47.28 0 47.28 50.57 7.11 0.13 0.11 2.35 10.81 3.3 None
610 0 0 3.20 0.72 44.96 0 44.96 47.17 6.87 0.13 0.10 2.36 10.75 3.3 None
# Plot the density plot for all the gene counts
################################################################################
plot_df %>%
  ggplot(aes(x = VALUE, color = REPLICATE)) +
  geom_density() +
  xlim(min(df_all$MIN), mean(df_all$Q3)) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
       x = "Abundance",
       y = "Density")

Metabolite by Metabolite Correlation (Original Abundances)

################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
  unite(METABOLITE_NAME,REPLICATE, 
        col = METABOLITE_NAME_REPLICATE, remove = F) %>%
  select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
  map(arrange, labelid) %>%
  map(tibble::column_to_rownames, var = "labelid") %>%
  map(as.matrix)

# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
  shared_met_mat <-  do.call(cbind,x) 
}

Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}


# Reorder the correlation matrix
cormat <- cor1

# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)

################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat 
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

Scatterplots (Original Abundances)

# Subset the data for sites to compare (scatterplots)
###################################################
plot_join_df <- countdata_df %>%
  ungroup() %>%
  unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
  filter(TISSUE == tissue) %>%
  filter(NAMED == named) %>%
  filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
  unnest(COUNT_DATA) %>%
  filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
  filter(METABOLITE_NAME %in% metabolites) %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                              grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                              )) %>%
  select(REPLICATE,METABOLITE_NAME,labelid,VALUE) %>%
  mutate(REPLICATE = as.character(REPLICATE)) %>%
  split(.$REPLICATE) %>%
  map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
  map(~select(., -REPLICATE)) %>%
  purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))

# Plot scatter plots
################################################################################
# Plot scatter plots ordered by sample (include R2 values)
lm_df <- plot_join_df %>%
  rename(name = METABOLITE_NAME) %>%
  rename(x = reps[1]) %>%
  rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
  met_df <- lm_df %>%
    filter(name == metab)
  r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
  arrange(METABOLITE) %>%
  select(METABOLITE,R2) %>%
  arrange(desc(R2))
met_r2_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
METABOLITE R2
2-Aminobutyric acid 0.92
Sarcosine 0.768
Hydroxyproline 0.435
Alanine 0.364
Aspartic acid 0.352
Tryptophan 0.261
Glutamic acid 0.238
Ornithine 0.182
Tyrosine 0.163
Anserine 0.112
Threonine 0.0956
Asparagine 0.0919
Citrulline 0.0892
Cystathionine 0.0874
Glutamine 0.0848
Valine 0.0716
Aminoadipic acid 0.0571
beta-Alanine 0.0485
Isoleucine 0.0442
3-Methylhistidine 0.0423
Phosphoethanolamine 0.0387
Proline 0.0321
1-Methylhistidine 0.0293
Carnosine 0.028
Lysine 0.0205
gamma-Aminobutyric acid 0.0201
Ethanolamine 0.018
Methionine 0.0138
Homocysteine 0.00987
Glycine 0.00965
Leucine 0.00811
Phenylalanine 0.0058
Taurine 0.00521
Serine 0.00448
Cysteine 0.00444
Arginine 0.00249
Histidine 0.000709
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()

# Plot the scatter plots
for(metab in scat_met_order){
  p <- plot_join_df %>%
    filter(METABOLITE_NAME == metab) %>%
  ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 1) +
  geom_abline(linetype="dashed") +
  facet_wrap(~ METABOLITE_NAME) +
  expand_limits(x = 0, y = 0) +
  #labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
  coord_fixed(ratio=1)
  plot(p)
}

Sample by Sample Correlations (Original Abundances)

################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################

# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
  unite(labelid,REPLICATE, 
        col = labelid_REPLICATE, remove = F) %>%
  select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
  map(as.matrix)
# Subset matrices
shared_sam_mat <-  do.call(rbind,x) %>% 
  t()

# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}

# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)

################################################################################
####################### Unclustered ############################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat

# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

PCA Analysis (Original Abundances)

pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
  tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
  sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')

p0 <- PC %>%
  ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
  geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
  ggtitle('Shared Samples and Shared Metabolites') + 
  xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
  ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
  theme(aspect.ratio=1)
p0

p1 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        scale_color_manual(values=ec_colors) +
        theme(aspect.ratio=1)
p1

PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        theme(aspect.ratio=1)
p2

Normalization (AutoScale)

# Normalize
# Note in this step, we use labelid to account for mayo's duplicate samples
################################################################################
norm_df <- plot_df %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  split(.$REPLICATE) %>%
  map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(column_to_rownames, var = "labelid_viallabel") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix) %>%
  map(as.data.frame) %>%
  map(rownames_to_column, var = "labelid_viallabel") %>%
  map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
  map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                                    grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  bind_rows()

Boxplots (Normalized Abundances)

# Plot Boxplots faceted by shared metabolites
################################################################################
norm_df %>%
  ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
  geom_boxplot(alpha = 0.8) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
             x = "Abundance", y = "") 

Metabolite by Metabolite Correlation (Normalized Abundances)

################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
  unite(METABOLITE_NAME,REPLICATE, 
        col = METABOLITE_NAME_REPLICATE, remove = F) %>%
  select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
  map(arrange, labelid) %>%
  map(tibble::column_to_rownames, var = "labelid") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix)

# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
  shared_met_mat <-  do.call(cbind,x) 
}

Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}

# Reorder the correlation matrix
cormat <- cor1

# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)

################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat 
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

Scatterplots (Normalized Abundances)

# Normalize
# Subset the data for sites to compare (scatterplots)
###################################################
norm_join_df <- countdata_df %>%
  ungroup() %>%
  unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
  filter(TISSUE == tissue) %>%
  filter(NAMED == named) %>%
  filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
  unnest(COUNT_DATA) %>%
  filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                               grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  split(.$REPLICATE) %>%
  map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(column_to_rownames, var = "labelid_viallabel") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix) %>%
  map(as.data.frame) %>%
  map(rownames_to_column, var = "labelid_viallabel") %>%
  map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
  map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                                    grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
  map(~select(., -REPLICATE)) %>%
  map(mutate, labelid = gsub(pattern = "_..*", replacement = '', labelid_viallabel)) %>%
  map(select, -labelid_viallabel) %>%
  purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))

# Plot scatter plots ordered by sample (include R2 values)
lm_df <- norm_join_df %>%
  rename(name = METABOLITE_NAME) %>%
  rename(x = reps[1]) %>%
  rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
  met_df <- lm_df %>%
    filter(name == metab)
  r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
  arrange(METABOLITE) %>%
  select(METABOLITE,R2) %>%
  arrange(desc(R2))
met_r2_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
METABOLITE R2
2-Aminobutyric acid 0.92
Sarcosine 0.768
Hydroxyproline 0.435
Alanine 0.364
Aspartic acid 0.352
Tryptophan 0.261
Glutamic acid 0.238
Ornithine 0.182
Tyrosine 0.163
Anserine 0.112
Threonine 0.0956
Asparagine 0.0919
Citrulline 0.0892
Cystathionine 0.0874
Glutamine 0.0848
Valine 0.0716
Aminoadipic acid 0.0571
beta-Alanine 0.0485
Isoleucine 0.0442
3-Methylhistidine 0.0423
Phosphoethanolamine 0.0387
Proline 0.0321
1-Methylhistidine 0.0293
Carnosine 0.028
Lysine 0.0205
gamma-Aminobutyric acid 0.0201
Ethanolamine 0.018
Methionine 0.0138
Homocysteine 0.00987
Glycine 0.00965
Leucine 0.00811
Phenylalanine 0.0058
Taurine 0.00521
Serine 0.00448
Cysteine 0.00444
Arginine 0.00249
Histidine 0.000709
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()

# Plot scatter plots (Normalized)
################################################################################
# Plot the scatter plots
for(metab in scat_met_order){
  p <- norm_join_df %>%
    filter(METABOLITE_NAME == metab) %>%
  ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 1) +
  geom_abline(linetype="dashed") +
  facet_wrap(~ METABOLITE_NAME) +
  expand_limits(x = 0, y = 0) +
  #labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
  coord_fixed(ratio=1)
  plot(p)
}

# norm_join_df %>%
#   mutate(METABOLITE_NAME = factor(METABOLITE_NAME, levels = scat_met_order)) %>%
#   ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
#   geom_point(alpha = 0.5) +
#   geom_smooth(method = "lm", size = 1) +
#   geom_abline(linetype="dashed") +
#   facet_wrap(~ METABOLITE_NAME) +
#   expand_limits(x = 0, y = 0) +
#   labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
#   coord_fixed(ratio=1)

Distributions (Normalized Abundances)

# Plot the density plot for all the gene counts
################################################################################
norm_df %>%
  ggplot(aes(x = VALUE, color = REPLICATE)) +
  geom_density() +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
       x = "Abundance",
       y = "Density")

# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in reps){
  # Collect numeric vector
  num_vec <- norm_df %>%
    filter(REPLICATE == rep) %>%
    select(VALUE) %>% unlist() %>% unname()
  df <- NumericSummaryStats(num_vec)
  row.names(df) <- rep
  df_all <- rbind(df_all, df)
}
df_all
#}

Sample by Sample Correlations (Normalized Abundances)

################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################

# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
  unite(labelid,REPLICATE, 
        col = labelid_REPLICATE, remove = F) %>%
  select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix)
# Subset matrices
shared_sam_mat <-  do.call(rbind,x) %>% 
  t()

# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}

# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)

################################################################################
####################### Unclustered ############################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat

# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

PCA Analysis (Normalized Abundances)

# Plot a PCA with outliers labeled
# shared_sam_mat <- shared_sam_mat %>%
#   t() %>% AutoScaleMatrix() %>%
#   t()
pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
  tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
  sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')

p0 <- PC %>%
  ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
  geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
  ggtitle('Shared Samples and Shared Metabolites') + 
  xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
  ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
  theme(aspect.ratio=1)
p0

p1 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        scale_color_manual(values=ec_colors) +
        theme(aspect.ratio=1)
p1

PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        theme(aspect.ratio=1)
p2